Setup

We will first begin by loading in the packages we intend to use.

Then, importing the data using a URL directly from the source, ensures we will capture updates to the data as they come in, whenever this is run again.

knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(fig.width = 12, fig.height = 8) 

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(ggmap)
## ℹ Google's Terms of Service: <https://mapsplatform.google.com>
## ℹ Please cite ggmap if you use it! Use `citation("ggmap")` for details.
library(ggplot2)

import_url <- read.csv("https://data.cityofnewyork.us/api/views/833y-fsy8/rows.csv?accessType=DOWNLOAD")

Transformation and Exploratory Data Analysis (EDA)

Let’s take a look at the dimensions of this imported data.frame, as well as the variable types of each column, and summary.

dim(import_url)
## [1] 25596    19
str(import_url)
## 'data.frame':    25596 obs. of  19 variables:
##  $ INCIDENT_KEY           : int  236168668 231008085 230717903 237712309 224465521 228252164 226950018 237710987 224701998 225295736 ...
##  $ OCCUR_DATE             : chr  "11/11/2021" "07/16/2021" "07/11/2021" "12/11/2021" ...
##  $ OCCUR_TIME             : chr  "15:04:00" "22:05:00" "01:09:00" "13:42:00" ...
##  $ BORO                   : chr  "BROOKLYN" "BROOKLYN" "BROOKLYN" "BROOKLYN" ...
##  $ PRECINCT               : int  79 72 79 81 113 113 42 52 34 75 ...
##  $ JURISDICTION_CODE      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ LOCATION_DESC          : chr  "" "" "" "" ...
##  $ STATISTICAL_MURDER_FLAG: chr  "false" "false" "false" "false" ...
##  $ PERP_AGE_GROUP         : chr  "" "45-64" "<18" "" ...
##  $ PERP_SEX               : chr  "" "M" "M" "" ...
##  $ PERP_RACE              : chr  "" "ASIAN / PACIFIC ISLANDER" "BLACK" "" ...
##  $ VIC_AGE_GROUP          : chr  "18-24" "25-44" "25-44" "25-44" ...
##  $ VIC_SEX                : chr  "M" "M" "M" "M" ...
##  $ VIC_RACE               : chr  "BLACK" "ASIAN / PACIFIC ISLANDER" "BLACK" "BLACK" ...
##  $ X_COORD_CD             : num  996313 981845 996546 1001139 1050710 ...
##  $ Y_COORD_CD             : num  187499 171118 187436 192775 184826 ...
##  $ Latitude               : num  40.7 40.6 40.7 40.7 40.7 ...
##  $ Longitude              : num  -74 -74 -74 -73.9 -73.8 ...
##  $ Lon_Lat                : chr  "POINT (-73.95650899099996 40.68131820000008)" "POINT (-74.00866668999998 40.63636384100005)" "POINT (-73.95566903799994 40.68114495900005)" "POINT (-73.939095905 40.69579171600003)" ...
summary(import_url)
##   INCIDENT_KEY        OCCUR_DATE         OCCUR_TIME            BORO          
##  Min.   :  9953245   Length:25596       Length:25596       Length:25596      
##  1st Qu.: 61593633   Class :character   Class :character   Class :character  
##  Median : 86437258   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :112382648                                                           
##  3rd Qu.:166660833                                                           
##  Max.   :238490103                                                           
##                                                                              
##     PRECINCT      JURISDICTION_CODE LOCATION_DESC      STATISTICAL_MURDER_FLAG
##  Min.   :  1.00   Min.   :0.0000    Length:25596       Length:25596           
##  1st Qu.: 44.00   1st Qu.:0.0000    Class :character   Class :character       
##  Median : 69.00   Median :0.0000    Mode  :character   Mode  :character       
##  Mean   : 65.87   Mean   :0.3316                                              
##  3rd Qu.: 81.00   3rd Qu.:0.0000                                              
##  Max.   :123.00   Max.   :2.0000                                              
##                   NA's   :2                                                   
##  PERP_AGE_GROUP       PERP_SEX          PERP_RACE         VIC_AGE_GROUP     
##  Length:25596       Length:25596       Length:25596       Length:25596      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    VIC_SEX            VIC_RACE           X_COORD_CD        Y_COORD_CD    
##  Length:25596       Length:25596       Min.   : 914928   Min.   :125757  
##  Class :character   Class :character   1st Qu.:1000011   1st Qu.:182782  
##  Mode  :character   Mode  :character   Median :1007715   Median :194038  
##                                        Mean   :1009455   Mean   :207894  
##                                        3rd Qu.:1016838   3rd Qu.:239429  
##                                        Max.   :1066815   Max.   :271128  
##                                                                          
##     Latitude       Longitude        Lon_Lat         
##  Min.   :40.51   Min.   :-74.25   Length:25596      
##  1st Qu.:40.67   1st Qu.:-73.94   Class :character  
##  Median :40.70   Median :-73.92   Mode  :character  
##  Mean   :40.74   Mean   :-73.91                     
##  3rd Qu.:40.82   3rd Qu.:-73.88                     
##  Max.   :40.91   Max.   :-73.70                     
## 

Feature Removal and Renaming

Looks like we have 19 columns (features) and 25596 rows (data points).
First, let’s remove any features that we won’t be needing for our analysis.

  1. JURISDICTION_CODE is pretty broad for localizing shooting incidents so we will end up using BORO instead which will give more insight to our analysis.
  2. X_COORD_CD, Y_COORD_CD, and Lon_Lat are all redundant.

Also, let’s rename a few of these for more readability.

import_url <- select(import_url, -JURISDICTION_CODE, -X_COORD_CD, -Y_COORD_CD, -Lon_Lat)
import_url <- import_url %>%
        rename(c('DATE' = 'OCCUR_DATE', 'TIME' = 'OCCUR_TIME','BOROUGH' = 'BORO', 
                'LOCATION' = 'LOCATION_DESC', 'MURDER_FLAG' = 'STATISTICAL_MURDER_FLAG', 
                'VICTIM_AGE' = 'VIC_AGE_GROUP', 'VICTIM_SEX' = 'VIC_SEX', 'VICTIM_RACE' = 'VIC_RACE',
                'LATITUDE' = 'Latitude', 'LONGITUDE' = 'Longitude'))
head(import_url)
##   INCIDENT_KEY       DATE     TIME  BOROUGH PRECINCT LOCATION MURDER_FLAG
## 1    236168668 11/11/2021 15:04:00 BROOKLYN       79                false
## 2    231008085 07/16/2021 22:05:00 BROOKLYN       72                false
## 3    230717903 07/11/2021 01:09:00 BROOKLYN       79                false
## 4    237712309 12/11/2021 13:42:00 BROOKLYN       81                false
## 5    224465521 02/16/2021 20:00:00   QUEENS      113                false
## 6    228252164 05/15/2021 04:13:00   QUEENS      113                 true
##   PERP_AGE_GROUP PERP_SEX                PERP_RACE VICTIM_AGE VICTIM_SEX
## 1                                                       18-24          M
## 2          45-64        M ASIAN / PACIFIC ISLANDER      25-44          M
## 3            <18        M                    BLACK      25-44          M
## 4                                                       25-44          M
## 5                                                       25-44          M
## 6                                                       25-44          M
##                VICTIM_RACE LATITUDE LONGITUDE
## 1                    BLACK 40.68132 -73.95651
## 2 ASIAN / PACIFIC ISLANDER 40.63636 -74.00867
## 3                    BLACK 40.68114 -73.95567
## 4                    BLACK 40.69579 -73.93910
## 5                    BLACK 40.67374 -73.76041
## 6                    BLACK 40.70618 -73.75806

Check for Duplicates and Remove

Next, we will check if there are any missing or duplicated data points, focusing only on the INCIDENT_KEY feature for now. This feature will be the most important for identifying any duplicate entries as they should all be unique.

# Check for any NA or Null values
any(is.na(import_url$INCIDENT_KEY)) | any(is.null(import_url$INCIDENT_KEY))
## [1] FALSE
# Check for duplicates
length(unique(import_url$INCIDENT_KEY))
## [1] 20126
length(import_url$INCIDENT_KEY)
## [1] 25596

Subtracting the results here shows that there are 5470 duplicate data points. Let’s take a look to make sure these aren’t false positives.

# Sort duplicates to see what they look like, but does not change dataframe
head(filter(import_url, duplicated(import_url$INCIDENT_KEY)))
##   INCIDENT_KEY       DATE     TIME   BOROUGH PRECINCT                  LOCATION
## 1    229643172 06/16/2021 23:22:00     BRONX       52                          
## 2    236363733 11/16/2021 22:39:00 MANHATTAN       14                          
## 3    226542152 04/05/2021 22:10:00     BRONX       44 MULTI DWELL - PUBLIC HOUS
## 4    227647476 05/02/2021 18:18:00 MANHATTAN       23 MULTI DWELL - PUBLIC HOUS
## 5    232496781 08/19/2021 20:32:00  BROOKLYN       77                          
## 6    232390408 08/17/2021 22:20:00  BROOKLYN       73            GROCERY/BODEGA
##   MURDER_FLAG PERP_AGE_GROUP PERP_SEX      PERP_RACE VICTIM_AGE VICTIM_SEX
## 1       false          18-24        M WHITE HISPANIC      25-44          F
## 2       false            <18        M          BLACK      25-44          M
## 3       false                                             45-64          M
## 4       false          18-24        M WHITE HISPANIC      25-44          M
## 5       false          45-64        M          BLACK      45-64          F
## 6       false                                             18-24          M
##   VICTIM_RACE LATITUDE LONGITUDE
## 1       BLACK 40.86414 -73.89131
## 2       BLACK 40.75165 -73.98434
## 3       BLACK 40.83750 -73.92785
## 4       BLACK 40.78694 -73.94357
## 5       BLACK 40.67036 -73.92680
## 6       BLACK 40.66835 -73.90652
# Check a few entries
arrange(filter(import_url, INCIDENT_KEY == 227647476 | INCIDENT_KEY == 232390408), INCIDENT_KEY)
##   INCIDENT_KEY       DATE     TIME   BOROUGH PRECINCT                  LOCATION
## 1    227647476 05/02/2021 18:18:00 MANHATTAN       23 MULTI DWELL - PUBLIC HOUS
## 2    227647476 05/02/2021 18:18:00 MANHATTAN       23 MULTI DWELL - PUBLIC HOUS
## 3    227647476 05/02/2021 18:18:00 MANHATTAN       23 MULTI DWELL - PUBLIC HOUS
## 4    232390408 08/17/2021 22:20:00  BROOKLYN       73            GROCERY/BODEGA
## 5    232390408 08/17/2021 22:20:00  BROOKLYN       73            GROCERY/BODEGA
##   MURDER_FLAG PERP_AGE_GROUP PERP_SEX      PERP_RACE VICTIM_AGE VICTIM_SEX
## 1       false            <18        M          BLACK      25-44          M
## 2       false          18-24        M WHITE HISPANIC      25-44          M
## 3       false          18-24        M          BLACK      25-44          M
## 4       false                                             25-44          M
## 5       false                                             18-24          M
##   VICTIM_RACE LATITUDE LONGITUDE
## 1       BLACK 40.78694 -73.94357
## 2       BLACK 40.78694 -73.94357
## 3       BLACK 40.78694 -73.94357
## 4       BLACK 40.66835 -73.90652
## 5       BLACK 40.66835 -73.90652
# Remove duplicates
import_url <- filter(import_url, !duplicated(import_url$INCIDENT_KEY))
# Check work
sum(duplicated(import_url$INCIDENT_KEY))
## [1] 0

Change Feature Class Types

For better analysis we should change the class type of a few of these features to make them easier to work with.

# Character to Date and Period
import_url <- import_url %>% mutate(DATE = mdy(DATE)) %>%
        mutate(TIME = hms(TIME))

# Character to Factors - changes all character columns to factor
import_url <- import_url %>% mutate(across(where(is.character), as.factor))
str(import_url)
## 'data.frame':    20126 obs. of  15 variables:
##  $ INCIDENT_KEY  : int  236168668 231008085 230717903 237712309 224465521 228252164 226950018 237710987 224701998 225295736 ...
##  $ DATE          : Date, format: "2021-11-11" "2021-07-16" ...
##  $ TIME          :Formal class 'Period' [package "lubridate"] with 6 slots
##   .. ..@ .Data : num  0 0 0 0 0 0 0 0 0 0 ...
##   .. ..@ year  : num  0 0 0 0 0 0 0 0 0 0 ...
##   .. ..@ month : num  0 0 0 0 0 0 0 0 0 0 ...
##   .. ..@ day   : num  0 0 0 0 0 0 0 0 0 0 ...
##   .. ..@ hour  : num  15 22 1 13 20 4 21 19 0 6 ...
##   .. ..@ minute: num  4 5 9 42 0 13 8 30 18 15 ...
##  $ BOROUGH       : Factor w/ 5 levels "BRONX","BROOKLYN",..: 2 2 2 2 4 4 1 1 3 2 ...
##  $ PRECINCT      : int  79 72 79 81 113 113 42 52 34 75 ...
##  $ LOCATION      : Factor w/ 40 levels "","ATM","BANK",..: 1 1 1 1 1 1 10 1 1 1 ...
##  $ MURDER_FLAG   : Factor w/ 2 levels "false","true": 1 1 1 1 1 2 2 1 1 2 ...
##  $ PERP_AGE_GROUP: Factor w/ 10 levels "","<18","1020",..: 1 7 2 1 1 1 1 1 1 6 ...
##  $ PERP_SEX      : Factor w/ 4 levels "","F","M","U": 1 3 3 1 1 1 1 1 1 3 ...
##  $ PERP_RACE     : Factor w/ 8 levels "","AMERICAN INDIAN/ALASKAN NATIVE",..: 1 3 4 1 1 1 1 1 1 5 ...
##  $ VICTIM_AGE    : Factor w/ 6 levels "<18","18-24",..: 2 3 3 3 3 3 2 3 3 3 ...
##  $ VICTIM_SEX    : Factor w/ 3 levels "F","M","U": 2 2 2 2 2 2 2 2 2 2 ...
##  $ VICTIM_RACE   : Factor w/ 7 levels "AMERICAN INDIAN/ALASKAN NATIVE",..: 3 2 3 3 3 3 3 3 4 7 ...
##  $ LATITUDE      : num  40.7 40.6 40.7 40.7 40.7 ...
##  $ LONGITUDE     : num  -74 -74 -74 -73.9 -73.8 ...

More Feature Checks

We will continue to look at the features and see if any of these blank entries will cause trouble during the analysis. Also, we’ll look to see if there are any duplicate categorical factors in the rest of the features.

# Create a table of each column to check factor levels
for(i in 1:length(import_url)){
    ifelse(is.factor(import_url[ ,i]), print(table(import_url[ ,i, drop = FALSE])), next)
}
## BOROUGH
##         BRONX      BROOKLYN     MANHATTAN        QUEENS STATEN ISLAND 
##          5611          8357          2536          3031           591 
## LOCATION
##                                                 ATM                      BANK 
##                     11964                         1                         2 
##            BAR/NIGHT CLUB         BEAUTY/NAIL SALON               CANDY STORE 
##                       400                        77                         6 
##               CHAIN STORE                CHECK CASH         CLOTHING BOUTIQUE 
##                         5                         1                        11 
##           COMMERCIAL BLDG                DEPT STORE            DOCTOR/DENTIST 
##                       194                         5                         1 
##                DRUG STORE       DRY CLEANER/LAUNDRY         FACTORY/WAREHOUSE 
##                         7                        24                         5 
##                 FAST FOOD               GAS STATION            GROCERY/BODEGA 
##                        77                        50                       468 
##      GYM/FITNESS FACILITY                  HOSPITAL               HOTEL/MOTEL 
##                         3                        37                        25 
##             JEWELRY STORE              LIQUOR STORE              LOAN COMPANY 
##                         9                        24                         1 
##   MULTI DWELL - APT BUILD MULTI DWELL - PUBLIC HOUS                      NONE 
##                      2007                      3664                       140 
##          PHOTO/COPY STORE                 PVT HOUSE          RESTAURANT/DINER 
##                         1                       641                       153 
##                    SCHOOL                SHOE STORE            SMALL MERCHANT 
##                         1                         4                        18 
## SOCIAL CLUB/POLICY LOCATI          STORAGE FACILITY        STORE UNCLASSIFIED 
##                        43                         1                        28 
##               SUPERMARKET           TELECOMM. STORE             VARIETY STORE 
##                        14                         2                        10 
##               VIDEO STORE 
##                         2 
## MURDER_FLAG
## false  true 
## 16604  3522 
## PERP_AGE_GROUP
##             <18    1020   18-24     224   25-44   45-64     65+     940 UNKNOWN 
##    8120     993       1    4086       1    3769     374      44       1    2737 
## PERP_SEX
##           F     M     U 
##  8090   212 10469  1355 
## PERP_RACE
##                                AMERICAN INDIAN/ALASKAN NATIVE 
##                           8090                              2 
##       ASIAN / PACIFIC ISLANDER                          BLACK 
##                             88                           7871 
##                 BLACK HISPANIC                        UNKNOWN 
##                            803                           1627 
##                          WHITE                 WHITE HISPANIC 
##                            213                           1432 
## VICTIM_AGE
##     <18   18-24   25-44   45-64     65+ UNKNOWN 
##    1984    7585    9122    1273     125      37 
## VICTIM_SEX
##     F     M     U 
##  1522 18598     6 
## VICTIM_RACE
## AMERICAN INDIAN/ALASKAN NATIVE       ASIAN / PACIFIC ISLANDER 
##                              8                            260 
##                          BLACK                 BLACK HISPANIC 
##                          14742                           1854 
##                        UNKNOWN                          WHITE 
##                             50                            517 
##                 WHITE HISPANIC 
##                           2695

Incident Coordinate Data Visualized on a Map

Here we’re going to visualize the location of each shooting incident using the coordinates given in the dataset. First, we can use the minimum and maximum values of the longitudes and latitudes to find the map’s bounding box (edges). Then, use ggmap() to generate a map centered around these coordinates. Then, we can use geom_point() and stat_density2d_filled() to superimpose our data on the map using the same coordinate system we generated.

# Initialize the bounding box that will contain the map coordinates.
map_bounds <- c(left = min(import_url$LONGITUDE), 
        bottom = min(import_url$LATITUDE),
        right = max(import_url$LONGITUDE), 
        top = max(import_url$LATITUDE))

# Initialize the scatter plot of the incident coordinates
# Note, there are better maps out there but most require a private google API key,
# which wouldn't work for this public project.
incident_map_point <- ggmap(get_stamenmap(map_bounds, maptype = 'terrain', zoom = 11)) + 
        geom_point(data = import_url, 
                aes(x = LONGITUDE, y = LATITUDE),
                color = 'darkred', 
                size = 0.25, 
                alpha = 0.2) +
        ggtitle('Point Plot of NYPD Shooting Incident Reporting 2006 - 2021\n    Source:<https://catalog.data.gov/dataset/nypd-shooting-incident-data-historic>') +
        labs(x = 'LONGITUDE', y = 'LATITUDE')
## ℹ Map tiles by Stamen Design, under CC BY 3.0. Data by OpenStreetMap, under ODbL.
# Display Point Map
incident_map_point

# Initialize density map to better visualize regions with frequent incidents.
incident_map_density <- ggmap(get_stamenmap(map_bounds, maptype = 'terrain', zoom = 11)) + 
        stat_density2d_filled(data = import_url, contour_var = 'density',
                aes(x = LONGITUDE, y = LATITUDE, fill = after_stat(level)), 
                bins = 20, 
                geom = 'polygon', 
                alpha = 0.8) +
        geom_density_2d(data = import_url, 
                aes(x = LONGITUDE, y = LATITUDE), 
                bins = 20, 
                alpha = 0.2, 
                color = "white") +
        guides(fill = guide_legend(title = "Density")) + 
        ggtitle('Density Plot of NYPD Shooting Incident Reporting 2006 - 2021\n    Source:<https://catalog.data.gov/dataset/nypd-shooting-incident-data-historic>') +
        labs(x = 'LONGITUDE', y = 'LATITUDE')
## ℹ Map tiles by Stamen Design, under CC BY 3.0. Data by OpenStreetMap, under ODbL.
# Display Density Map
incident_map_density